Load required packages
Get paths to 20 WGBS bedGraph files for 10 matching prostate tumour and normal prostate sample pairs
bedgraphs = list.files(system.file("extdata", "prostate_cancer_chr11_wgbs_bedgraphs",
package = "methodical"), full.names = T)Methrix requires that bedGraphs have a coverage column. This is added using add_dummy_coverage_to_bedgraphs().
add_dummy_coverage_to_bedgraphs(bedgraphs = bedgraphs, output_directory = "bedgraphs_with_dummy_coverage",
ncores = 4)Get CpGs for the hg38 genome build
Create annotation for samples
# Create sample annotation
sample_annotation = data.frame(condition = c(rep("Normal", 10),
rep("Tumour", 10)), patient = rep(c("1", "10", "12", "18",
"19", "26", "28", "31", "32", "6"), times = 2), row.names = gsub("_WGBS_chr11.bg.gz",
"", basename(bedgraphs)))Create HDF5Array from prostate bedGraphs for chr11
Load CpG islands for hg38
hg38_cgis = rtracklayer::import.bed(system.file("extdata", "masked_cpg_islands_hg38.bed",
package = "methodical"))Get methylation values for CpGs in CpG islands on chromosome 11
prostate_chr11_cgi_methylation = extract_cpg_values_from_methrix(methrix = prostate_methrix,
regions = hg38_cgis[seqnames(hg38_cgis) == "chr11"])
#> -Subsetting by genomic regionsCalculate mean methylation in CpG islands on chr11 across 2 MB windows
prostate_chr11_cgi_methylation_2mb_windows = cpg_window_summary(cpg_values = prostate_chr11_cgi_methylation,
window_size = 2 * 10^6, min_cpgs_per_window = 10)Plot methylation of CpG islands for a subset of normal and tumour samples showing all samples with separate colours
sample_subset = c("N1", "N10", "N12", "T1", "T10", "T12")
plot_cpg_methylation(cpg_values = prostate_chr11_cgi_methylation_2mb_windows,
geom = "geom_linepoint", samples = sample_subset, group_colours = c(RColorBrewer::brewer.pal(3,
"Blues"), RColorBrewer::brewer.pal(3, "Reds")), title = "Chr11 CpG island Methylation in Prostate Tumour and Normal Samples")Now use sample_groups to use separate colours for normal and tumour samples
plot_cpg_methylation(cpg_values = prostate_chr11_cgi_methylation_2mb_windows,
geom = "geom_linepoint", samples = sample_subset, sample_groups = colData(prostate_methrix)[sample_subset,
"condition"], group_colours = c("#00BFC4", "#F8766D"),
title = "Chr11 CpG island Methylation in Prostate Tumour and Normal Samples")We can also show boxplots instead of the line graphs
plot_cpg_methylation(cpg_values = prostate_chr11_cgi_methylation_2mb_windows,
geom = "geom_boxplot", samples = sample_subset, sample_groups = colData(prostate_methrix)[sample_subset,
"condition"], group_colours = c("#00BFC4", "#F8766D"),
title = "Chr11 CpG island Methylation in Prostate Tumour and Normal Samples")Calculate mean CpG island methylation difference between tumour and normal samples and plot
prostate_chr11_cgi_methylation_2mb_windows$mean_tumour_normal_difference = rowMeans(select(prostate_chr11_cgi_methylation_2mb_windows,
starts_with("T")), na.rm = T) - rowMeans(select(prostate_chr11_cgi_methylation_2mb_windows,
starts_with("N")), na.rm = T)
plot_cpg_methylation(cpg_values = prostate_chr11_cgi_methylation_2mb_windows,
geom = "geom_linepoint", samples = "mean_tumour_normal_difference",
group_colours = "darkblue", title = "Mean Chr11 CpG island Methylation Change in Prostate Tumour Samples")Get the Transcription start site for GSTP1 (using the ENST00000398606 transcript) from biomaRt
mart = useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
gstp1_tss_biomart_result = getBM(mart = mart, attributes = c("chromosome_name",
"transcription_start_site"), filters = "ensembl_transcript_id",
values = list("ENST00000398606"))
# Create a GRanges object for the GSTP1 TSS and set
# chromosome style to UCSC so that it matches the
# chromosome style in the bedGraphs and methrix object.
gstp1_tss_gr = GenomicRanges::GRanges(seqnames = gstp1_tss_biomart_result$chromosome_name,
ranges = IRanges(gstp1_tss_biomart_result$transcription_start_site))
seqlevelsStyle(gstp1_tss_gr) = "UCSC"Extract methylation values for CpGs within 5000 bp upstream and 2000 downstream of GSTP1 TSS
gstp1_methylation = extract_cpg_values_from_methrix(methrix = prostate_methrix,
regions = expand_granges(gstp1_tss_gr, 5000, 2000))
#> -Subsetting by genomic regions
# Calculate the mean methylation difference at GSTP1 CpG
# sites between tumour and normal samples and plot
gstp1_methylation_tumour_normal_difference = dplyr::transmute(gstp1_methylation,
mean_tumour_normal_difference = rowMeans(dplyr::select(gstp1_methylation,
starts_with("T")), na.rm = T) - rowMeans(dplyr::select(gstp1_methylation,
starts_with("N")), na.rm = T))Load table of transcript TPM values
transcript_tpm = read.table(system.file("extdata", "prostate_tumour_normal_transcript_tpm.tsv.gz",
package = "methodical"), header = T, sep = "\t", row.names = 1)
gstp1_tpm = unlist(transcript_tpm["ENST00000398606", ])Get Spearman correlation of methylation of CpGs in vicinity of GSTP1 TSS with GSTP1 TPM
gstp1_cpg_methylation_correlation = cpg_feature_cor(feature = gstp1_tpm,
cpg_table = gstp1_methylation, method = "s")
# Add distance of CpGs to ENST00000398606 TSS to the table
gstp1_cpg_methylation_correlation$tss_distance = cpg_distances_to_region(query_region = gstp1_tss_gr,
cpg_names = row.names(gstp1_cpg_methylation_correlation))
# Take a look at the top of the table
head(gstp1_cpg_methylation_correlation)
#> correlation p_value q_value tss_distance
#> chr11:67578857 0.50190141 0.02414048 0.04353201 -4955
#> chr11:67578878 0.46704897 0.03787681 0.06561338 -4934
#> chr11:67578909 0.38958433 0.08952002 0.13676670 -4903
#> chr11:67578938 0.01963022 0.93453238 0.93490341 -4874
#> chr11:67578950 -0.13857203 0.56014065 0.61925097 -4862
#> chr11:67578954 -0.05751159 0.80968102 0.83817529 -4858Plot correlation of methylation and transcription for CpG sites near GSTP1 TSS showing CpG position on chr11 on the x-axis
plot_cpg_methylation(cpg_values = gstp1_cpg_methylation_correlation,
geom = "geom_linepoint", samples = "correlation", title = "Correlation of CpG Methylation with GSTP1 Transcription")Create same plot except showing distance to TSS on the x-axis by giving the GSTP1 TSS as the reference_region
gstp1_tss_cpg_correlation_plot = plot_cpg_methylation(cpg_values = gstp1_cpg_methylation_correlation,
geom = "geom_linepoint", samples = "correlation", group_colours = "#66C2A5",
reference_region = gstp1_tss_gr, title = "Correlation of CpG Methylation with GSTP1 Transcription",
xlabel = "Distance to TSS", ylabel = "Spearman Correlation") +
geom_hline(yintercept = 0, linetype = "dashed")
gstp1_tss_cpg_correlation_plotgstp1_tss_cpg_methylation_plot = plot_cpg_methylation(cpg_values = gstp1_methylation_tumour_normal_difference,
geom = "geom_linepoint", samples = "mean_tumour_normal_difference",
group_colours = "#5E4FA2", reference_region = gstp1_tss_gr,
title = "CpG Methylation Close to GTSP1 TSS", xlabel = "Distance to TSS") +
geom_hline(yintercept = 0, linetype = "dashed")
gstp1_tss_cpg_methylation_plotplotly can be used to create an interactive plot which shows CpG/region name. Here we show just the name of the CpG sites with the tooltip.
hide_legend(ggplotly(gstp1_tss_cpg_methylation_plot, tooltip = c("position_name")))
#> Warning: `gather_()` was deprecated in tidyr 1.2.0.
#> Please use `gather()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.Combine methylation and correlation plots to investigate the relationship between CpG methylation-transcription correlation values and methylation change in prostate cancer
We can explore the relationship between methylation of individual CpG sites and related features by creating a scatter plot with methylation_feature_scatter_plot(). We’ll make a plot for methylation of the CpG chr11:67583741 and GSTP1 TPM
methylation_feature_scatter_plot(cpg_values = gstp1_methylation,
cpg_name = "chr11:67583741", feature = log(gstp1_tpm), title = "GSTP1 Expression and chr11:67583741 Methylation",
xlab = "Methylation Value", ylab = "Log GSTP1 TPM", add_correlation = T,
method = "p")
#> Warning: Removed 2 rows containing non-finite values (stat_cor).
#> Warning: Removed 2 rows containing missing values (geom_point).
# We can make the same plot colouring the normal and tumour
# samples and showing a separate regression line for them
methylation_feature_scatter_plot(cpg_values = gstp1_methylation,
cpg_name = "chr11:67583741", feature = log(gstp1_tpm), title = "GSTP1 Expression and chr11:67583741 Methylation",
xlab = "Methylation Value", ylab = "Log GSTP1 TPM", regression_line = T,
add_correlation = T, method = "p", sample_groups = colData(prostate_methrix)$condition)
#> `geom_smooth()` using formula 'y ~ x'
#> Warning: Removed 2 rows containing non-finite values (stat_smooth).
#> Warning: Removed 2 rows containing non-finite values (stat_cor).
#> Warning: Removed 2 rows containing missing values (geom_point).sessionInfo()
#> R version 4.2.0 (2022-04-22)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] BSgenome.Hsapiens.UCSC.hg38_1.4.4 BSgenome_1.64.0
#> [3] rtracklayer_1.56.0 Biostrings_2.64.0
#> [5] XVector_0.36.0 ggpubr_0.4.0
#> [7] plotly_4.10.0 biomaRt_2.52.0
#> [9] methodical_0.0.0.9000 ggplot2_3.3.6
#> [11] methrix_1.10.0 SummarizedExperiment_1.26.1
#> [13] Biobase_2.56.0 GenomicRanges_1.48.0
#> [15] GenomeInfoDb_1.32.2 IRanges_2.30.0
#> [17] S4Vectors_0.34.0 BiocGenerics_0.42.0
#> [19] MatrixGenerics_1.8.0 matrixStats_0.62.0
#> [21] data.table_1.14.2
#>
#> loaded via a namespace (and not attached):
#> [1] backports_1.4.1 BiocFileCache_2.4.0
#> [3] plyr_1.8.7 lazyeval_0.2.2
#> [5] splines_4.2.0 crosstalk_1.2.0
#> [7] BiocParallel_1.30.2 usethis_2.1.6
#> [9] digest_0.6.29 foreach_1.5.2
#> [11] htmltools_0.5.2 fansi_1.0.3
#> [13] magrittr_2.0.3 memoise_2.0.1
#> [15] doParallel_1.0.17 remotes_2.4.2
#> [17] R.utils_2.11.0 prettyunits_1.1.1
#> [19] colorspace_2.0-3 blob_1.2.3
#> [21] rappdirs_0.3.3 xfun_0.31
#> [23] dplyr_1.0.9 callr_3.7.0
#> [25] crayon_1.5.1 RCurl_1.98-1.6
#> [27] jsonlite_1.8.0 iterators_1.0.14
#> [29] glue_1.6.2 gtable_0.3.0
#> [31] zlibbioc_1.42.0 DelayedArray_0.22.0
#> [33] car_3.0-13 pkgbuild_1.3.1
#> [35] Rhdf5lib_1.18.2 HDF5Array_1.24.1
#> [37] abind_1.4-5 scales_1.2.0
#> [39] DBI_1.1.2 rstatix_0.7.0
#> [41] Rcpp_1.0.8.3 viridisLite_0.4.0
#> [43] progress_1.2.2 bit_4.0.4
#> [45] htmlwidgets_1.5.4 httr_1.4.3
#> [47] RColorBrewer_1.1-3 ellipsis_0.3.2
#> [49] farver_2.1.0 R.methodsS3_1.8.1
#> [51] pkgconfig_2.0.3 XML_3.99-0.9
#> [53] sass_0.4.1 dbplyr_2.2.0
#> [55] utf8_1.2.2 labeling_0.4.2
#> [57] reshape2_1.4.4 tidyselect_1.1.2
#> [59] rlang_1.0.2 AnnotationDbi_1.58.0
#> [61] munsell_0.5.0 tools_4.2.0
#> [63] cachem_1.0.6 cli_3.3.0
#> [65] generics_0.1.2 RSQLite_2.2.14
#> [67] devtools_2.4.3 broom_0.8.0
#> [69] evaluate_0.15 stringr_1.4.0
#> [71] fastmap_1.1.0 yaml_2.3.5
#> [73] processx_3.5.3 knitr_1.39
#> [75] bit64_4.0.5 fs_1.5.2
#> [77] purrr_0.3.4 KEGGREST_1.36.2
#> [79] nlme_3.1-157 sparseMatrixStats_1.8.0
#> [81] formatR_1.12 R.oo_1.24.0
#> [83] xml2_1.3.3 brio_1.1.3
#> [85] compiler_4.2.0 filelock_1.0.2
#> [87] curl_4.3.2 png_0.1-7
#> [89] testthat_3.1.4 ggsignif_0.6.3
#> [91] tibble_3.1.7 bslib_0.3.1
#> [93] stringi_1.7.6 highr_0.9
#> [95] ps_1.7.0 desc_1.4.1
#> [97] lattice_0.20-45 Matrix_1.4-1
#> [99] vctrs_0.4.1 pillar_1.7.0
#> [101] lifecycle_1.0.1 rhdf5filters_1.8.0
#> [103] jquerylib_0.1.4 cowplot_1.1.1
#> [105] bitops_1.0-7 R6_2.5.1
#> [107] BiocIO_1.6.0 sessioninfo_1.2.2
#> [109] codetools_0.2-18 assertthat_0.2.1
#> [111] pkgload_1.2.4 rhdf5_2.40.0
#> [113] rprojroot_2.0.3 rjson_0.2.21
#> [115] withr_2.5.0 GenomicAlignments_1.32.0
#> [117] Rsamtools_2.12.0 GenomeInfoDbData_1.2.8
#> [119] mgcv_1.8-40 parallel_4.2.0
#> [121] hms_1.1.1 grid_4.2.0
#> [123] tidyr_1.2.0 rmarkdown_2.14
#> [125] DelayedMatrixStats_1.18.0 carData_3.0-5
#> [127] restfulr_0.0.14